1 load data set precalculated p-values

helper function

loadRData <- function(fileName){
#loads an RData file, and returns it
    load(fileName)
    get(ls()[ls() != "fileName"])
}

1.1 data set 1

The following is the same same data set as in hQTL analysis as in https://rss.onlinelibrary.wiley.com/doi/10.1111/rssb.12411?af=R, Section 6: APPLICATION EXAMPLE: BIOLOGICAL HIGH-THROUGHPUT DATA. It is taken from Grubert et al..

chr21_full_data_grubert <- loadRData(here("data/hqtl_chrom1_chrom2/snps21.filtered.allGT.txt.rda"))

Sample names (The population sample is 76 individuals).

MatrixEQTL::colnames(chr21_full_data_grubert)[1:10]
##  [1] "NA19160" "NA18861" "NA18862" "NA18909" "NA18907" "NA18517" "NA19119"
##  [8] "NA19116" "NA19114" "NA19171"

SNPs

MatrixEQTL::rownames(chr21_full_data_grubert)[1:10]
##  [1] "rs183048582" "rs187728300" "rs185808153" "rs150746917" "rs140976465"
##  [6] "rs146497069" "rs76813625"  "rs137909700" "rs71262411"  "rs71269378"

top-left submatrix

chr21_full_data_grubert[[1]][1:10,1:10]
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]    0    0    0    0    0    0    1    0    0     0
##  [2,]    0    0    0    0    0    0    1    0    0     0
##  [3,]    0    0    0    1    0    0    1    1    0     0
##  [4,]    0    0    0    0    0    0    0    0    0     0
##  [5,]    0    0    0    0    0    0    0    0    0     0
##  [6,]    0    0    0    0    1    0    0    0    0     0
##  [7,]    2    2    2    2    1    2    2    2    2     2
##  [8,]    0    1    0    0    0    0    0    1    0     0
##  [9,]    2    2    2    2    2    2    1    2    2     2
## [10,]    2    2    2    2    2    2    1    2    2     2

0 means no alternative allele, 1 means 1 alternative allele, 2 means 2 alternative allele,

The p-values are pre-calculated with MatrixEQTL package ref. It is basically row-wise t-test with covariate adjustment. Here is the code. The pre-calculated file was downloaded from here.

chr21_pvalues_grubert <- data.table::fread(here::here("data/hqtl_chrom1_chrom2/results/cisQTLs_H3K27AC_chr21.txt"))
chr21_pvalues_grubert <- chr21_pvalues_grubert %>% rename(tstat = `t-stat`, pvalue = `p-value`)
head(chr21_pvalues_grubert)
##            SNP  gene        beta      tstat    pvalue
## 1: rs183048582 64684 -0.30947859 -1.1006867 0.2746493
## 2: rs183048582 64685  0.03130456  0.1101570 0.9125872
## 3: rs183048582 64686 -0.04461388 -0.1545630 0.8775924
## 4: rs183048582 64687  0.56033736  2.0317715 0.0458169
## 5: rs183048582 64688  0.09431013  0.3271629 0.7444810
## 6: rs183048582 64689 -0.03752318 -0.1300745 0.8968652
chr21_pvalues_grubert <- chr21_pvalues_grubert %>% sample_n(1000) #3000000

SNP,gene are primary keys.

chr21_pvalues_grubert %>%
  group_by(SNP,gene) %>%
  filter(n()>1)
range(chr21_pvalues_grubert$pvalue)
## [1] 0.0004891307 0.9994648355

1.2 data set 2

We consider a second data set from (Boca and Leek)[https://github.com/SiminaB/Fdr-regression/blob/master/BMI%20GIANT%20meta-analysis/BMI_GIANT_GWAS_results_Scott_theoretical.RData].

pvalues_maf_boca <- loadRData(here("data/BMI_GIANT_GWAS.RData"))
head(pvalues_maf_boca)
## # A tibble: 6 × 9
##   SNP        A1    A2    Freq_MAF_Hapmap       b     se      p      N
##   <chr>      <chr> <chr>           <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 rs1000000  G     A               0.367 -0.0001 0.0043 0.981  233572
## 2 rs10000010 T     C               0.425  0.0022 0.0029 0.438  339148
## 3 rs10000012 G     C               0.192 -0.0096 0.0053 0.0701 236095
## 4 rs10000013 A     C               0.167  0.0096 0.0043 0.0256 236048
## 5 rs10000017 C     T               0.233  0.0038 0.0045 0.398  235308
## 6 rs10000023 G     T               0.408  0.0023 0.0037 0.534  236022
## # … with 1 more variable: Freq_MAF_Int_Hapmap <fct>

TODO why are there multiple rows per SNP?

pvalues_maf_boca %>%
  group_by(SNP) %>%
  filter(n()>1)

1.3 harmonise data set 1 and 2

We calculate the standard deviation for the beta estimate of the effect size from the beta_sefrom beta and tstat.

chr21_pvalues_grubert <- chr21_pvalues_grubert %>%
  mutate(beta_se = beta/tstat,
         #tstat = NULL,
         FDR = NULL)

We harmonise the two data sets.

pvalues_maf_boca <- pvalues_maf_boca %>%
  dplyr::rename(beta = b, beta_se = se, pvalue = p, maf_sample = Freq_MAF_Hapmap) %>%
  mutate(Freq_MAF_Int_Hapmap = NULL,
         A1 = NULL,
         A2 = NULL,
         tstat = beta/beta_se)

colnames(chr21_pvalues_grubert)
## [1] "SNP"     "gene"    "beta"    "tstat"   "pvalue"  "beta_se"
colnames(pvalues_maf_boca)
## [1] "SNP"        "maf_sample" "beta"       "beta_se"    "pvalue"    
## [6] "N"          "tstat"

TODO how is Freq_MAF_Hapmap calculated? Is it from sample?

2 check t-distribution

2.1 data set 1

ggplot(chr21_pvalues_grubert, aes(x = tstat)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  labs(title = "Histogram of tstat",
       x = "tstat",
       y = "Frequency")
An amazing plot

Figure 2.1: An amazing plot

qqPlot(chr21_pvalues_grubert$tstat, dist = "t", param.list = list(df = 74), add.line = TRUE)

TODO

qqPlot(chr21_pvalues_grubert$beta, dist = "norm", estimate.params = TRUE, add.line = TRUE)

#TODO more complicated because of \sqrt{s s_{x x} x}
qqPlot(chr21_pvalues_grubert$beta_se, dist = "chisq", param.list = list(df = 74), add.line = TRUE)

2.2 data set 2

qqPlot(pvalues_maf_boca$tstat, dist = "t", estimate.parameters = TRUE, add.line = TRUE)

3 calculate minor alle frequency

3.1 data set 1

This code is directly taken from here: http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/faq.html. Explenation: each entry of the matrix can take the values 0,1,2. rowMeans(slice,na.rm=TRUE)/2=rowSum(slice,na.rm=TRUE)/(2*76).

maf.list = vector('list', length(chr21_full_data_grubert))
for(sl in seq_along(chr21_full_data_grubert)) {
  slice = chr21_full_data_grubert[[sl]];
  maf.list[[sl]] = rowMeans(slice,na.rm=TRUE)/2;
  maf.list[[sl]] = pmin(maf.list[[sl]],1-maf.list[[sl]]);
}

We assign the maf to the snp identifier TODO mistake in this cell?

chr21_maf_sample <- data.frame(
  SNP = MatrixEQTL::rownames(chr21_full_data_grubert),
  maf_sample = unlist(maf.list)
)
chr21_pvalues_grubert <- chr21_pvalues_grubert %>%
  inner_join(chr21_maf_sample, by = "SNP") 
ggplot(chr21_pvalues_grubert, aes(x = maf_sample)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  labs(title = "Histogram of MAF Sample",
       x = "MAF Sample",
       y = "Frequency")

This is plausible and checks out with what we would expect from the data set description.

range(chr21_pvalues_grubert$maf_sample)
## [1] 0.05263158 0.49342105

3.2 data set 2

already pre calculated

ggplot(pvalues_maf_boca, aes(x = maf_sample)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  labs(title = "Histogram of MAF Sample",
       x = "MAF Sample",
       y = "Frequency")

The boca leek data set seems to be not filtered for significant SNPs.

range(pvalues_maf_boca$maf_sample)
## [1] 0.0 0.5

4 compare pvalues and standard deviation of effect size estimate

4.1 data set 1

The following plot makes sense. In chr21_pvalues_grubert we have filtered out SNPs with low MAF and high pvalues.

ggplot(chr21_pvalues_grubert,
       aes(x = beta_se , y = pvalue)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

4.2 data set 2

TODO why is beta_se independent of pvalue.

ggplot(pvalues_maf_boca,
       aes(x = beta_se, y =  pvalue)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

Reproducing what happens for data set 1.

ggplot(pvalues_maf_boca %>%
         filter(pvalue <= 10^(-2), maf_sample >= 0.05),
       aes(x = beta_se, y =  pvalue)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

5 comparing p-values with maf sample

5.1 data set 1

maf_sample does not seem super informative.

ggplot(chr21_pvalues_grubert,
       aes(x = maf_sample, y = pvalue)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

5.2 data set 2

Here the data set 2 exhibits a much stronger trend. maf_sample seems more informative.

ggplot(pvalues_maf_boca,
       aes(x = maf_sample, y = pvalue)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

6 stratifying pvalues by beta std

groups_by_filter <- function(covariate, nbins, ties.method="random", seed=NULL){
  if (!is.null(seed) && ties.method=="random"){
    #http://stackoverflow.com/questions/14324096/setting-seed-locally-not-globally-in-r?rq=1
    tmp <- runif(1)
    old <- .Random.seed
    on.exit( { .Random.seed <<- old } )
    set.seed(as.integer(seed)) 
  }
    rfs <- rank(covariate, ties.method=ties.method)/length(covariate)
    as.factor(ceiling( rfs* nbins))
}

6.1 stratifying pvalues by beta std, data set 1

mutate(beta_se = beta/tstat)

ggplot(chr21_pvalues_grubert %>% select(pvalue), 
       aes(x = pvalue)) +
  geom_histogram(boundary = 0) +
  facet_wrap(groups_by_filter(chr21_pvalues_grubert$beta_se, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## data set 2

ggplot(pvalues_maf_boca, 
       aes(x = pvalue)) +
  geom_histogram(boundary = 0) +
  facet_wrap(groups_by_filter(pvalues_maf_boca$beta_se, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7 stratifying pvalues by maf

7.1 data set 1

ggplot(chr21_pvalues_grubert %>% select(pvalue), 
       aes(x = pvalue)) +
  geom_histogram(boundary = 0, bins = 40) +
  facet_wrap(groups_by_filter(chr21_pvalues_grubert$maf_sample, 8), nrow = 2)

7.2 data set 2

ggplot(pvalues_maf_boca, 
       aes(x = pvalue)) +
  geom_histogram(boundary = 0) +
  facet_wrap(groups_by_filter(pvalues_maf_boca$maf_sample, 8), nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# compare minor alle frequency and standard deviation of effect size estimate

7.3 data set 1

The beta_se estimate is calculated from the pre-calculated p-value. So it is assumed, that the pre-calculated p-value is correct. This is plausible: std should decrease with smaller beta_se.

ggplot(chr21_pvalues_grubert,
       aes(x = maf_sample, y = beta_se)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

ggplot(chr21_pvalues_grubert,
       aes(x = maf_downloaded, y = beta_se)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

So the sample maf looks more informative

7.4 data set 2

The beta_se estimate is calculated from the pre-calculated p-value. So it is assumed, that the pre-calculated p-value is correct.

ggplot(pvalues_maf_boca,
       aes(x = maf_sample, y = beta_se)) +
  scale_y_log10()+
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex()  #bins = 60

8 download maf from online data base

8.1 data set 1

ensembl <- biomaRt::useMart("ENSEMBL_MART_SNP", dataset = "hsapiens_snp")

Download minor_allele_freq for first data set

chr21_maf_grubert_downloaded <- biomaRt::getBM(
  attributes = c("refsnp_id", "minor_allele_freq"),
  filters = "snp_filter", values = "rs1620325",#unique(chr21_pvalues_grubert$SNP),
  mart = ensembl, uniqueRows = TRUE
)
writeRDS(chr21_maf_grubert_downloaded, here("data/downloaded_covariates/chr21_maf.Rds"))

Load the pre-downloaded maf

chr21_maf_grubert_downloaded <- readRDS(here("data/downloaded_covariates/chr21_maf.Rds"))

join, filter out rows where downloaded maf is missing

chr21_maf_grubert_downloaded <- chr21_maf_grubert_downloaded %>% 
  dplyr::rename(maf_downloaded = minor_allele_freq)

chr21_pvalues_grubert <- chr21_pvalues_grubert %>%
  inner_join(chr21_maf_grubert_downloaded, by = c("SNP" = "refsnp_id")) %>%
  drop_na(maf_downloaded)
ggplot(chr21_pvalues_grubert, aes(x = maf_downloaded)) +
  geom_histogram(fill = "lightblue", color = "black", bins = 30) +
  labs(title = "Histogram of MAF Download",
       x = "MAF Download",
       y = "Frequency")
range(chr21_pvalues_grubert$maf_downloaded)

8.2 data set 2

chr21_maf <- biomaRt::getBM(
  attributes = c("refsnp_id", "minor_allele_freq"),
  filters = "snp_filter", values = unique(pvalues_maf_boca$SNP),
  mart = ensembl, uniqueRows = TRUE
)

9 comparing downloaded and calculated maf

9.1 data set 1

The sample MAF does not go below 0.05.

ggplot(chr21_pvalues_grubert,
       aes(x = maf_sample, y = maf_downloaded)) +
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex(bins = 60) +
  geom_abline(intercept=0, slope=1, color="red") +
  labs(x = "MAF Sample",
       y = "MAF Downloaded")
ggplot(chr21_pvalues_grubert,
       aes(x = rank(maf_sample), y = rank(maf_downloaded))) +
  scale_fill_gradient(name = "count", trans = "log10")+
  geom_hex(bins = 60) +
  geom_abline(intercept=0, slope=1, color="red") +
  labs(x = "MAF Sample Rank",
       y = "MAF Downloaded Rank")